{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pkl\n",
    "import pandas as pd\n",
    "import random\n",
    "import numpy as np\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>article_id</th>\n",
       "      <th>product_code</th>\n",
       "      <th>prod_name</th>\n",
       "      <th>product_type_no</th>\n",
       "      <th>product_type_name</th>\n",
       "      <th>product_group_name</th>\n",
       "      <th>graphical_appearance_no</th>\n",
       "      <th>graphical_appearance_name</th>\n",
       "      <th>colour_group_code</th>\n",
       "      <th>colour_group_name</th>\n",
       "      <th>...</th>\n",
       "      <th>department_name</th>\n",
       "      <th>index_code</th>\n",
       "      <th>index_name</th>\n",
       "      <th>index_group_no</th>\n",
       "      <th>index_group_name</th>\n",
       "      <th>section_no</th>\n",
       "      <th>section_name</th>\n",
       "      <th>garment_group_no</th>\n",
       "      <th>garment_group_name</th>\n",
       "      <th>detail_desc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0108775015</td>\n",
       "      <td>0108775</td>\n",
       "      <td>Strap top</td>\n",
       "      <td>253</td>\n",
       "      <td>Vest top</td>\n",
       "      <td>Garment Upper body</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>09</td>\n",
       "      <td>Black</td>\n",
       "      <td>...</td>\n",
       "      <td>Jersey Basic</td>\n",
       "      <td>A</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>16</td>\n",
       "      <td>Womens Everyday Basics</td>\n",
       "      <td>1002</td>\n",
       "      <td>Jersey Basic</td>\n",
       "      <td>Jersey top with narrow shoulder straps.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0108775044</td>\n",
       "      <td>0108775</td>\n",
       "      <td>Strap top</td>\n",
       "      <td>253</td>\n",
       "      <td>Vest top</td>\n",
       "      <td>Garment Upper body</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>10</td>\n",
       "      <td>White</td>\n",
       "      <td>...</td>\n",
       "      <td>Jersey Basic</td>\n",
       "      <td>A</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>16</td>\n",
       "      <td>Womens Everyday Basics</td>\n",
       "      <td>1002</td>\n",
       "      <td>Jersey Basic</td>\n",
       "      <td>Jersey top with narrow shoulder straps.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0108775051</td>\n",
       "      <td>0108775</td>\n",
       "      <td>Strap top (1)</td>\n",
       "      <td>253</td>\n",
       "      <td>Vest top</td>\n",
       "      <td>Garment Upper body</td>\n",
       "      <td>1010017</td>\n",
       "      <td>Stripe</td>\n",
       "      <td>11</td>\n",
       "      <td>Off White</td>\n",
       "      <td>...</td>\n",
       "      <td>Jersey Basic</td>\n",
       "      <td>A</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>16</td>\n",
       "      <td>Womens Everyday Basics</td>\n",
       "      <td>1002</td>\n",
       "      <td>Jersey Basic</td>\n",
       "      <td>Jersey top with narrow shoulder straps.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0110065001</td>\n",
       "      <td>0110065</td>\n",
       "      <td>OP T-shirt (Idro)</td>\n",
       "      <td>306</td>\n",
       "      <td>Bra</td>\n",
       "      <td>Underwear</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>09</td>\n",
       "      <td>Black</td>\n",
       "      <td>...</td>\n",
       "      <td>Clean Lingerie</td>\n",
       "      <td>B</td>\n",
       "      <td>Lingeries/Tights</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>61</td>\n",
       "      <td>Womens Lingerie</td>\n",
       "      <td>1017</td>\n",
       "      <td>Under-, Nightwear</td>\n",
       "      <td>Microfibre T-shirt bra with underwired, moulde...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0110065002</td>\n",
       "      <td>0110065</td>\n",
       "      <td>OP T-shirt (Idro)</td>\n",
       "      <td>306</td>\n",
       "      <td>Bra</td>\n",
       "      <td>Underwear</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>10</td>\n",
       "      <td>White</td>\n",
       "      <td>...</td>\n",
       "      <td>Clean Lingerie</td>\n",
       "      <td>B</td>\n",
       "      <td>Lingeries/Tights</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>61</td>\n",
       "      <td>Womens Lingerie</td>\n",
       "      <td>1017</td>\n",
       "      <td>Under-, Nightwear</td>\n",
       "      <td>Microfibre T-shirt bra with underwired, moulde...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105537</th>\n",
       "      <td>0953450001</td>\n",
       "      <td>0953450</td>\n",
       "      <td>5pk regular Placement1</td>\n",
       "      <td>302</td>\n",
       "      <td>Socks</td>\n",
       "      <td>Socks &amp; Tights</td>\n",
       "      <td>1010014</td>\n",
       "      <td>Placement print</td>\n",
       "      <td>09</td>\n",
       "      <td>Black</td>\n",
       "      <td>...</td>\n",
       "      <td>Socks Bin</td>\n",
       "      <td>F</td>\n",
       "      <td>Menswear</td>\n",
       "      <td>3</td>\n",
       "      <td>Menswear</td>\n",
       "      <td>26</td>\n",
       "      <td>Men Underwear</td>\n",
       "      <td>1021</td>\n",
       "      <td>Socks and Tights</td>\n",
       "      <td>Socks in a fine-knit cotton blend with a small...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105538</th>\n",
       "      <td>0953763001</td>\n",
       "      <td>0953763</td>\n",
       "      <td>SPORT Malaga tank</td>\n",
       "      <td>253</td>\n",
       "      <td>Vest top</td>\n",
       "      <td>Garment Upper body</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>09</td>\n",
       "      <td>Black</td>\n",
       "      <td>...</td>\n",
       "      <td>Jersey</td>\n",
       "      <td>A</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>2</td>\n",
       "      <td>H&amp;M+</td>\n",
       "      <td>1005</td>\n",
       "      <td>Jersey Fancy</td>\n",
       "      <td>Loose-fitting sports vest top in ribbed fast-d...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105539</th>\n",
       "      <td>0956217002</td>\n",
       "      <td>0956217</td>\n",
       "      <td>Cartwheel dress</td>\n",
       "      <td>265</td>\n",
       "      <td>Dress</td>\n",
       "      <td>Garment Full body</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>09</td>\n",
       "      <td>Black</td>\n",
       "      <td>...</td>\n",
       "      <td>Jersey</td>\n",
       "      <td>A</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>18</td>\n",
       "      <td>Womens Trend</td>\n",
       "      <td>1005</td>\n",
       "      <td>Jersey Fancy</td>\n",
       "      <td>Short, A-line dress in jersey with a round nec...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105540</th>\n",
       "      <td>0957375001</td>\n",
       "      <td>0957375</td>\n",
       "      <td>CLAIRE HAIR CLAW</td>\n",
       "      <td>72</td>\n",
       "      <td>Hair clip</td>\n",
       "      <td>Accessories</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>09</td>\n",
       "      <td>Black</td>\n",
       "      <td>...</td>\n",
       "      <td>Small Accessories</td>\n",
       "      <td>D</td>\n",
       "      <td>Divided</td>\n",
       "      <td>2</td>\n",
       "      <td>Divided</td>\n",
       "      <td>52</td>\n",
       "      <td>Divided Accessories</td>\n",
       "      <td>1019</td>\n",
       "      <td>Accessories</td>\n",
       "      <td>Large plastic hair claw.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105541</th>\n",
       "      <td>0959461001</td>\n",
       "      <td>0959461</td>\n",
       "      <td>Lounge dress</td>\n",
       "      <td>265</td>\n",
       "      <td>Dress</td>\n",
       "      <td>Garment Full body</td>\n",
       "      <td>1010016</td>\n",
       "      <td>Solid</td>\n",
       "      <td>11</td>\n",
       "      <td>Off White</td>\n",
       "      <td>...</td>\n",
       "      <td>Jersey</td>\n",
       "      <td>A</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>1</td>\n",
       "      <td>Ladieswear</td>\n",
       "      <td>18</td>\n",
       "      <td>Womens Trend</td>\n",
       "      <td>1005</td>\n",
       "      <td>Jersey Fancy</td>\n",
       "      <td>Calf-length dress in ribbed jersey made from a...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>105542 rows × 25 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        article_id product_code               prod_name product_type_no  \\\n",
       "0       0108775015      0108775               Strap top             253   \n",
       "1       0108775044      0108775               Strap top             253   \n",
       "2       0108775051      0108775           Strap top (1)             253   \n",
       "3       0110065001      0110065       OP T-shirt (Idro)             306   \n",
       "4       0110065002      0110065       OP T-shirt (Idro)             306   \n",
       "...            ...          ...                     ...             ...   \n",
       "105537  0953450001      0953450  5pk regular Placement1             302   \n",
       "105538  0953763001      0953763       SPORT Malaga tank             253   \n",
       "105539  0956217002      0956217         Cartwheel dress             265   \n",
       "105540  0957375001      0957375        CLAIRE HAIR CLAW              72   \n",
       "105541  0959461001      0959461            Lounge dress             265   \n",
       "\n",
       "       product_type_name  product_group_name graphical_appearance_no  \\\n",
       "0               Vest top  Garment Upper body                 1010016   \n",
       "1               Vest top  Garment Upper body                 1010016   \n",
       "2               Vest top  Garment Upper body                 1010017   \n",
       "3                    Bra           Underwear                 1010016   \n",
       "4                    Bra           Underwear                 1010016   \n",
       "...                  ...                 ...                     ...   \n",
       "105537             Socks      Socks & Tights                 1010014   \n",
       "105538          Vest top  Garment Upper body                 1010016   \n",
       "105539             Dress   Garment Full body                 1010016   \n",
       "105540         Hair clip         Accessories                 1010016   \n",
       "105541             Dress   Garment Full body                 1010016   \n",
       "\n",
       "       graphical_appearance_name colour_group_code colour_group_name  ...  \\\n",
       "0                          Solid                09             Black  ...   \n",
       "1                          Solid                10             White  ...   \n",
       "2                         Stripe                11         Off White  ...   \n",
       "3                          Solid                09             Black  ...   \n",
       "4                          Solid                10             White  ...   \n",
       "...                          ...               ...               ...  ...   \n",
       "105537           Placement print                09             Black  ...   \n",
       "105538                     Solid                09             Black  ...   \n",
       "105539                     Solid                09             Black  ...   \n",
       "105540                     Solid                09             Black  ...   \n",
       "105541                     Solid                11         Off White  ...   \n",
       "\n",
       "          department_name index_code        index_name index_group_no  \\\n",
       "0            Jersey Basic          A        Ladieswear              1   \n",
       "1            Jersey Basic          A        Ladieswear              1   \n",
       "2            Jersey Basic          A        Ladieswear              1   \n",
       "3          Clean Lingerie          B  Lingeries/Tights              1   \n",
       "4          Clean Lingerie          B  Lingeries/Tights              1   \n",
       "...                   ...        ...               ...            ...   \n",
       "105537          Socks Bin          F          Menswear              3   \n",
       "105538             Jersey          A        Ladieswear              1   \n",
       "105539             Jersey          A        Ladieswear              1   \n",
       "105540  Small Accessories          D           Divided              2   \n",
       "105541             Jersey          A        Ladieswear              1   \n",
       "\n",
       "       index_group_name section_no            section_name garment_group_no  \\\n",
       "0            Ladieswear         16  Womens Everyday Basics             1002   \n",
       "1            Ladieswear         16  Womens Everyday Basics             1002   \n",
       "2            Ladieswear         16  Womens Everyday Basics             1002   \n",
       "3            Ladieswear         61         Womens Lingerie             1017   \n",
       "4            Ladieswear         61         Womens Lingerie             1017   \n",
       "...                 ...        ...                     ...              ...   \n",
       "105537         Menswear         26           Men Underwear             1021   \n",
       "105538       Ladieswear          2                    H&M+             1005   \n",
       "105539       Ladieswear         18            Womens Trend             1005   \n",
       "105540          Divided         52     Divided Accessories             1019   \n",
       "105541       Ladieswear         18            Womens Trend             1005   \n",
       "\n",
       "       garment_group_name                                        detail_desc  \n",
       "0            Jersey Basic            Jersey top with narrow shoulder straps.  \n",
       "1            Jersey Basic            Jersey top with narrow shoulder straps.  \n",
       "2            Jersey Basic            Jersey top with narrow shoulder straps.  \n",
       "3       Under-, Nightwear  Microfibre T-shirt bra with underwired, moulde...  \n",
       "4       Under-, Nightwear  Microfibre T-shirt bra with underwired, moulde...  \n",
       "...                   ...                                                ...  \n",
       "105537   Socks and Tights  Socks in a fine-knit cotton blend with a small...  \n",
       "105538       Jersey Fancy  Loose-fitting sports vest top in ribbed fast-d...  \n",
       "105539       Jersey Fancy  Short, A-line dress in jersey with a round nec...  \n",
       "105540        Accessories                           Large plastic hair claw.  \n",
       "105541       Jersey Fancy  Calf-length dress in ribbed jersey made from a...  \n",
       "\n",
       "[105542 rows x 25 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"./Data/articles.csv\", sep=',', dtype=str)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['article_id', 'product_code', 'prod_name', 'product_type_no',\n",
       "       'product_type_name', 'product_group_name', 'graphical_appearance_no',\n",
       "       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',\n",
       "       'perceived_colour_value_id', 'perceived_colour_value_name',\n",
       "       'perceived_colour_master_id', 'perceived_colour_master_name',\n",
       "       'department_no', 'department_name', 'index_code', 'index_name',\n",
       "       'index_group_no', 'index_group_name', 'section_no', 'section_name',\n",
       "       'garment_group_no', 'garment_group_name', 'detail_desc'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(105542, 47224, (105542,), (105542,))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from bidict import bidict\n",
    "artid_bidict, pcode_bidict = bidict(), bidict()\n",
    "# start counting from 1, because 0 is reserved for mask value\n",
    "artid_cnt, pcode_cnt = 0, 0\n",
    "artid, pcode = [], []\n",
    "for id, code in zip(df['article_id'], df['product_code']):\n",
    "    if id not in artid_bidict:\n",
    "        artid_cnt += 1\n",
    "        artid_bidict[id] = artid_cnt\n",
    "    if code not in pcode_bidict:\n",
    "        pcode_cnt += 1\n",
    "        pcode_bidict[code] = pcode_cnt\n",
    "    artid.append(artid_bidict[id])\n",
    "    pcode.append(pcode_bidict[code])\n",
    "artid = np.array(artid, dtype='int')\n",
    "pcode = np.array(pcode, dtype='int')\n",
    "artid_cnt, pcode_cnt, artid.shape, pcode.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([     1,      2,      3, ..., 105540, 105541, 105542]),\n",
       " array([    1,     1,     1, ..., 47222, 47223, 47224]))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "artid, pcode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./Data/Processed/article_sparse.pkl', 'wb') as f:\n",
    "    pkl.dump(artid, f, pkl.HIGHEST_PROTOCOL)\n",
    "    pkl.dump(pcode, f, pkl.HIGHEST_PROTOCOL)\n",
    "    pkl.dump(artid_bidict, f, pkl.HIGHEST_PROTOCOL)\n",
    "    pkl.dump(pcode_bidict, f, pkl.HIGHEST_PROTOCOL)\n",
    "    pkl.dump((artid_cnt, pcode_cnt), f, pkl.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "7c2e65e64076883662e9fbb467097aa6d81839e6b77012bdd0b6d8c4fbfb9623"
  },
  "kernelspec": {
   "display_name": "Python 3.7.11 ('SR-GNN')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
